
*time:2020/12/22

cd "D:\Statalearning\do\63随机森林"

import excel using "default of credit card clients.xls", clear firstrow
rename *, lower  //和原论文对应，将变量字母变小写
label define marriage_label 0 missing 1 married 2 single 3 other
label values marriage marriage_label
tabulate marriage, generate(marriage_enum)


   MARRIAGE |      Freq.     Percent        Cum.
------------+-----------------------------------
    missing |         54        0.18        0.18
    married |     13,659       45.53       45.71
     single |     15,964       53.21       98.92
      other |        323        1.08      100.00
------------+-----------------------------------
      Total |     30,000      100.00

set seed 201807
generate u=uniform()
sort u, stable

// figure out how large the value of iterations need to be
gen out_of_bag_error1 = .
gen validation_error = .
gen iter1 = .
local j = 0
forvalues i = 10(5)500 {
     local j = `j' + 1

     rforest defaultpaymentnextmonth limit_bal sex education marriage_enum* age pay* bill* in 1/1500, type(class) iter(`i') numvars(1)
     qui replace iter1 = `i' in `j'
     qui replace out_of_bag_error1 = `e(OOB_Error)' in `j'
     predict p in 1501/3000
     qui replace validation_error = `e(error_rate)' in `j'
     drop p
}

set scheme s1mono
label var out_of_bag_error1 "Out-of-bag error"
label var iter1 "Iterations"
label var validation_error "Validation error"
scatter out_of_bag_error1 iter1, mcolor(blue) msize(tiny) ||  scatter validation_error iter1, mcolor(red) msize(tiny)

graph save fig3.gph,replace 



gen oob_error = .
gen nvars = .
gen val_error = .
local j = 0
forvalues i = 1(1)26{
     local j = `j' + 1
     rforest defaultpaymentnextmonth limit_bal sex ///
     education marriage_enum* age pay* bill* in 1/1500, type(class) ///
     iter(500) numvars(`i')
     qui replace nvars = `i' in `j'
     qui replace oob_error = `e(OOB_Error)' in `j'
     predict p in 1501/3000
     qui replace val_error = `e(error_rate)' in `j'
     drop p
}
label var oob_error "Out-of-bag error"
label var val_error "Validation error"
label var nvars "Number of variables randomly selected at each split"
scatter oob_error nvars, mcolor(blue) msize(tiny) ||   ///
scatter val_error nvars, mcolor(red) msize(tiny)

graph save fig4.gph, replace
//cap graph save oob_and_val_error_numvars.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name


cap frame drop mydata
// only run when tuning is run
frame put val_error nvars, into(mydata)
frame mydata { 
   sort val_error, stable
   local min_val_err = val_error[1]
   local min_nvars = nvars[1]
}
frame drop mydata
local min_val_err : display %9.4f `min_val_err'
local min_nvars = `min_nvars' 
di "Minimum Error: `min_val_err'; Corresponding number of variables `min_nvars'"
//list val_error if nvars==18
//texdoc sum val_error , nolog nooutput  // assuming that 18 is at the minimum

// final model: numvars = 18, iter = 1000
rforest defaultpaymentnextmonth limit_bal sex education marriage_enum* age pay* bill* in 1/1500, type(class) iter(1000) numvars(11)
di e(OOB_Error)

// I cannot save scalars
// texdoc local only saves them until the next texdoc stlog starts
// assigning a global macro does not work with the "no_do"  option

predict prf in 1501/3000
di e(error_rate)

matrix importance = e(importance)
svmat importance
gen importid=""

local mynames : rownames importance
local k : word count `mynames'
if `k'>_N {
     set obs `k'
}
forvalues i = 1(1)`k' {
     local aword : word `i' of `mynames'
     local alabel : variable label `aword'
     if ("`alabel'"!="") qui replace importid= "`alabel'" in `i'
     else qui replace importid= "`aword'" in `i'
}

graph hbar (mean) importance, over(importid, sort(1) label(labsize(2))) ///
     ytitle(Importance)
graph save fig5.gph, replace

//cap graph save 1importance.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name



set scheme s1mono

twoway (hist limit_bal if defaultpaymentnextmonth == 0) (hist limit_bal if defaultpaymentnextmonth == 1,  fcolor(none) lcolor(black)), legend(order(1 "no default" 2 "default" ))

graph save fig6.gph,replace
//cap graph save 1histogram.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name


// If I change the sort order logistic regression will be affected
logistic defaultpaymentnextmonth limit_bal sex education  marriage_enum* age pay* bill* in 1/1500
    
predict plogit in 1501/3000
replace plogit = 0 if plogit <= 0.5 & plogit != .
replace plogit = 1 if plogit > 0.5 & plogit != .
gen error = plogit != defaultpaymentnextmonth
sum error in 1501/3000
local logit_acc= strofreal(100*r(mean), "%9.2f")


clear
// always run because it reorders data
import delimited OnlineNewsPopularity.csv
set seed 201807
gen u = uniform()
sort u, stable
gen logShares = ln(shares)

// replace logShares = 0.01 if logShares == .  




gen out_of_bag_error1 = .
gen validation_error = .
gen iter1 = .
local j = 0
forvalues i = 10(5)100 {
     local j = `j' + 1
     rforest logShares n_* average_* num_* ///
     data_* kw_* self_* weekday_* lda_* global_* ///
     is_weekend rate_* min_* max_* avg_* title_* abs_* in 1/19822, ///
     type(reg) iter(`i') numvars(1)
     qui replace iter1 = `i' in `j'
     qui replace out_of_bag_error1 = `e(OOB_Error)' in `j'
     predict p in 19823/39644
     qui replace validation_error = `e(RMSE)' in `j'
     drop p
}
label var out_of_bag_error1 "Out-of-bag error"
label var iter1 "Iterations"
label var validation_error "Validation RMSE"
scatter out_of_bag_error1 iter1, mcolor(blue) msize(tiny) || ///
   scatter validation_error iter1, mcolor(red) msize(tiny)
graph save fig7.gph,replace 
graph export rforest7.eps, as(eps) replace
//cap graph save 2scatter_oob.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name



gen oob_error = .
gen nvars = .
gen val_error = .
local j = 0
forvalues i = 1(1)58{
     local j = `j' + 1
     rforest logShares n_* average_* num_* ///
     data_* kw_* self_* weekday_* lda_* global_* ///
     is_weekend rate_* min_* max_* avg_* title_* abs_* in 1/1982, ///
     type(reg) iter(100) numvars(`i') 
     qui replace nvars = `i' in `j'
     qui replace oob_error = `e(OOB_Error)' in `j'
     predict p in 1983/3964
     qui replace val_error = `e(RMSE)' in `j'
     drop p
}
label var oob_error "Out-of-bag error"
label var val_error "Validation RMSE"
label var nvars "Number of variables randomly selected at each split"
scatter oob_error nvars, mcolor(blue) msize(tiny) || ///
   scatter val_error nvars, mcolor(red) msize(tiny)

graph save fig8.gph,replace 

graph export rforest8.eps, as(eps) replace
//cap graph save 2scatter_numvars.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name




cap frame drop mydata2
// only run when tuning is run
frame put val_error nvars, into(mydata2)
frame mydata2 { 
   sort val_error, stable
   local min_val_err = val_error[1]
   local min_nvars = nvars[1]
}
frame drop mydata2
local min_val_err : display %9.4f `min_val_err'
local min_nvars = `min_nvars' 
di "Minimum Error: `min_val_err'; Corresponding number of variables `min_nvars'"


// certify this
rforest logShares n_* average_* num_* ///
     data_* kw_* self_* weekday_* lda_* global_* ///
     is_weekend rate_* min_* max_* avg_* title_* abs_* in 1/1982, ///
     type(reg) iter(100) numvars(21) 
ereturn list OOB_Error
predict prf in 1983/3964
ereturn list RMSE 
local OOB_error : display %9.4f `e(OOB_Error)'
local RMSE : display %9.4f `e(RMSE)'



// variable importance plot
matrix importance2 = e(importance)
svmat importance2
gen importid2=""

local mynames : rownames importance2
local k : word count `mynames'
if `k'>_N {
     set obs `k'
}
forvalues i = 1(1)`k' {
     local aword : word `i' of `mynames'
     local alabel : variable label `aword'
     if ("`alabel'"!="") qui replace importid2= "`alabel'" in `i'
     else qui replace importid2= "`aword'" in `i'
}

graph hbar (mean) importance2 if importance2>.4, over(importid2, sort(1) ///
     label(labsize(2))) ytitle(Importance)
	 
graph save fig9.gph,replace 
	 
graph export rforest9.eps, as(eps) replace
//cap graph save 2importance.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name




twoway (hist logShares if is_weekend == 0) ///
   (hist logShares if is_weekend == 1, fcolor(none) lcolor(black)), ///
   legend(order(1 "weekday" 2 "weekend" ))
   
graph save fig10.gph,replace 

graph export rforest10.eps, as(eps) replace
//cap graph save 2histogram.gph, replace 
//cap graph close // prevent that the graph is accidentally saved below under a different name


regress logShares n_* average_* num_* ///
     data_* kw_* self_* weekday_* lda_* global_* ///
     is_weekend rate_* min_* max_* avg_* title_* abs_* in 1/1982
predict pregress in 1983/3964
ereturn list rmse


gen diff_sqr= (logShares - pregress)^2
summarize diff_sqr


















